#!/usr/bin/env perl
use strict;
use warnings;
use threads;
use Thread::Queue;
use File::Basename;
use FindBin;
use Pod::Usage;

my $version = "v1.2";
# customized parameters
my $seq_path = ""; #specify the sequence file
my $finder_para = "-w 2 -C -D 15000 -d 1000 -L 7000 -l 100 -p 20 -M 0.85"; #specify ltr_finder parameters. The "-w 2" parameter is required.
my $harvest_format = 0; #0, print out format is in LTR_finder table output format (default); 1, LTRharvest screen output format
my $size_of_each_piece = 5000000; #5 Mb per piece
my $timeout = 1500; #set maximum time for a thread to run. After $timeout seconds, the child thread is killed.
my $try1 = 1; #1, further split to 50 Kb regions if thread killed by timeout. 0, no further split.
my $next = 0; #0, run LTR_FINDER. 1, only summarize results if the .finder folder is retained.
my $verbose = 0; #0, remove the .finder folder after finished; 1, retain the folder for later use.
my $threads = 4; #specify thread number to run ltr_finder
my $annotator = "LTR_FINDER_parallel"; #the annotator name used in gff output

# dependencies
#my $script_path = dirname(__FILE__);
my $script_path = $FindBin::Bin;
my $cut = "$script_path/bin/cut.pl"; #the program to cut sequence
my $convert = "$script_path/bin/convert_ltr_finder2.pl"; #the program to convert finder -w2 to harvest scn format
my $ltr_finder = ""; #the path to ltr_finder
my $check_dependencies = undef;
my $help = undef;

my $usage = "
~ ~ ~ Run LTR_FINDER in parallel ~ ~ ~

Author: Shujun Ou (shujun.ou.1\@gmail.com)
Date: 09/19/2018
Update: 01/28/2020
Version: $version

Usage: perl LTR_FINDER_parallel -seq [file] -size [int] -threads [int]
Options:	-seq	[file]	Specify the sequence file.
		-size	[int]	Specify the size you want to split the genome sequence.
				Please make it large enough to avoid spliting too many LTR elements. Default 5000000 (bp)
		-time	[int]	Specify the maximum time to run a subregion (a thread).
				This helps to skip simple repeat regions that take a substantial of time to run. Default: 1500 (seconds).
				Suggestion: 300 for -size 1000000. Increase -time when -size increased.
		-try1	[0|1]	If a region requires more time than the specified -time (timeout), decide:
					0, discard the entire region.
					1, further split to 50 Kb regions to salvage LTR candidates (default);
		-harvest_out	Output LTRharvest format if specified. Default: output LTR_FINDER table format.
		-next		Only summarize the results for previous jobs without rerunning LTR_FINDER (for -v).
		-verbose|-v	Retain LTR_FINDER outputs for each sequence piece.
		-finder	[file]	The path to the program LTR_FINDER (default v1.0.7, included in this package).
		-threads|-t	[int]	Indicate how many CPU/threads you want to run LTR_FINDER.
		-check_dependencies Check if dependencies are fullfiled and quit
		-help|-h	Display this help information.
\n";


# read user parameters
my $i=0;
foreach (@ARGV){
	$seq_path = $ARGV[$i+1] if /^-seq$/i;
	$size_of_each_piece = $ARGV[$i+1] if /^-size$/i;
	$timeout = $ARGV[$i+1] if /^-time$/i;
	$try1 = $ARGV[$i+1] if /^-try1$/i;
	$harvest_format = 1 if /^-harvest_out$/i;
	$next = 1 if /^-next$/i;
	$verbose = 1 if /^-verbose$|^-v$/i;
	$cut = $ARGV[$i+1] if /^-cut$/i;
	$ltr_finder = $ARGV[$i+1] if /^-finder$/i;
	$threads = $ARGV[$i+1] if /^-threads$|^-t$/i;
	$check_dependencies = 1 if /^-check_dependencies$/i;
	$help = 1 if /^-help$|^-h$/i;
	$i++;
	}

# check parameters
if ($help) {
    pod2usage( { -verbose => 0,
                 -exitval => 0,
                 -message => "$usage\n" } );
}

$ltr_finder=`which ltr_finder 2>/dev/null` if $ltr_finder eq '';
$ltr_finder = "$script_path/bin/LTR_FINDER.x86_64-1.0.7/ltr_finder" if $ltr_finder eq ''; #use default ltr_finder if not ENV
$ltr_finder=~s/ltr_finder\n?//;
$ltr_finder="$ltr_finder/" if $ltr_finder ne '' and $ltr_finder !~ /\/$/;
die "LTR_FINDER is not exist in the path $ltr_finder!\n" unless -X "${ltr_finder}ltr_finder";
print "\nUsing this LTR_FINDER: $ltr_finder\n"; #test

if ( ! $seq_path and (! $check_dependencies) ){
    pod2usage( {
           -message => "At least 1 parameter mandatory:\n1) Input fasta file: -seq\n".
           "$usage\n\n",
           -verbose => 0,
           -exitval => 2 } );
}
print "Pass!\n";
exit if $check_dependencies;

# make a softlink to the genome file
my $seq_file = basename($seq_path);
`ln -s $seq_path $seq_file` unless -s $seq_file;


if ($threads == 1){
# run the original single threaded code
`${ltr_finder}ltr_finder $finder_para $seq_file > $seq_file.finder.scn`;
if ($harvest_format == 0){
	`mv $seq_file.finder.scn $seq_file.finder.combine.scn`;
	} else {
	`perl $convert $seq_file.finder.scn > $seq_file.finder.combine.scn`;
	}

} else {
## run the paralleled code
# read genome in memory
open SEQ, "<$seq_file" or die $usage;
open GFF, ">$seq_file.finder.combine.gff3" or die $!;
print GFF "##gff-version   3\n";
my %seq; #a hash to store seq name and length info
my %order; #store seq order in genome
my $chr_info; #store seq id in genome
$i=0;
$/ = "\n>";
while (<SEQ>){
	s/>//g;
	my ($id, $seq) = (split /\n/, $_, 2);
	$seq =~ s/\s+//g;
	my $len = length $seq;
	print GFF "##sequence-region   $id 1 $len\n";
	$chr_info.="#$id\n";
	$seq{$id} = $len;
	$order{$id} = $i;
	$i++;
	}
print GFF "$chr_info";
close SEQ;
$/="\n";

goto Next if $next == 1; #run the next step if all LTR_finder processes are done

# prepre files
`mkdir $seq_file.finder` unless -d "$seq_file.finder";
chdir "$seq_file.finder";
`perl $cut ../$seq_file -s -l $size_of_each_piece`;

##multi-threading using queue, create a worker module for parallel computation
my $process_q=Thread::Queue->new();
sub worker {
	while (my $seq = $process_q -> dequeue()){
		chomp $seq;
		$seq =~ s/\s+//;
		print localtime() ." CPU".threads -> self() -> tid().": running on $seq\n";
		`timeout $timeout ${ltr_finder}ltr_finder $finder_para $seq > $seq.finder.scn`; #set the max running time for a thread as $timeout seconds
		if ($? ne 0 and $try1 ne 0){
			print localtime() ." CPU".threads -> self() -> tid().": $seq timeout, process it with the salvage mode\n";
			my $in=`perl $script_path/LTR_FINDER_parallel -seq $seq -size 50000 -time 10 -try1 0 -threads 1 -cut $cut -finder $ltr_finder`;
			`mv $seq.finder.combine.scn $seq.finder.scn`;
			`rm -rf ${seq}.finder`; #remove?
			}
		}
	}

#insert seq names into the worker queue
open List, "<../$seq_file.list" or die $!;
$process_q -> enqueue (<List>);
$process_q -> end(); #stop adding items to the queue
close List;

#work and finish
for (1..$threads){
	threads -> create ( \&worker );
	}
foreach my $thr (threads -> list()){
	$thr -> join();
	}

chdir "../";

Next:
#combine split ltr_finder results
open Out, ">$seq_file.finder.combine.scn" or die $!;

#print out headers
if ($harvest_format == 0){
	print Out "index	SeqID	Location	LTR len	Inserted element len	TSR	PBS	PPT	RT	IN (core)	IN (c-term)	RH	Strand	Score	Sharpness	Similarity\n";
	} else {
	print Out "#LTR_FINDER_parallel -seq $seq_file -size $size_of_each_piece -time $timeout -try1 $try1 -harvest_out -threads $threads -cut $cut -finder $ltr_finder
# LTR_FINDER args=$finder_para
# LTR_FINDER_parallel version=$version
# predictions are reported in the following way
# s(ret) e(ret) l(ret) s(lLTR) e(lLTR) l(lLTR) s(rLTR) e(rLTR) l(rLTR) sim(LTRs) seq-nr chr
# where:
# s = starting position
# e = ending position
# l = length
# ret = LTR-retrotransposon
# lLTR = left LTR
# rLTR = right LTR
# sim = similarity
# seq-nr = sequence order\n";
	}

# process each candidates
open List, "<$seq_file.list" or die $!;
my $count = 1; #count repeats in gff
foreach my $seq (<List>){
	$seq =~ s/\s+//;
	my ($base, $order) = ($seq, 1);
	($base, $order) = ($1, $2) if $seq =~ /(.*)_sub([0-9]+)$/;
	my $coord_adj = ($order - 1) * $size_of_each_piece;
	next unless -e "$seq_file.finder/$seq.finder.scn";
	open Scn, "<$seq_file.finder/$seq.finder.scn" or die $!;
	while (<Scn>){
		next unless /^\[/;
		s/^\[\s+[0-9]+\]/\[NA\]/;
		my ($index, $id, $loc, $ltr_len, $ele_len, $TSR, $PBS, $PPT, $RT, $IN_core, $IN_cterm, $RH, $strand, $score, $sharpness, $sim) = (split);

		#convert coordinates back to the genome scale
		my @coord = ($loc, $PBS, $PPT, $RT, $IN_core, $IN_cterm, $RH);
		my $i = -1;
		foreach (@coord) {
			$i++;
			next if /^N-N$/;
			my ($start, $end) = ($1+$coord_adj, $2+$coord_adj) if /([0-9]+)\-([0-9]+)/;
			$coord[$i] = "$start-$end";
			}
		my ($start, $end) = ($1, $2) if $coord[0] =~ /([0-9]+)\-([0-9]+)/;
		my ($lltr, $rltr) = ($1, $2) if $ltr_len=~/([0-9]+),([0-9]+)/; 
		my ($lltr_e, $rltr_s) = ($start+$lltr-1, $end-$rltr+1);

		#output LTRharvest or LTR_FINDER (-w 2) format
		if ($harvest_format == 0){
			print Out "[NA]\t$base\t$coord[0]\t$ltr_len\t$ele_len\t$TSR\t$coord[1]\t$coord[2]\t$coord[3]\t$coord[4]\t$coord[5]\t$coord[6]\t$strand\t$score\t$sharpness\t$sim\n";
			} else {
			$sim*=100;
			print Out "$start $end $ele_len $start $lltr_e $lltr $rltr_s $end $rltr $sim $order{$base} $base\n";
			}

		#print GFF format
		my $chr = $base;
		print GFF "$chr\t$annotator\trepeat_region\t$start\t$end\t.\t$strand\t.\tID=repeat_region$count\n";
		#print GFF "$chr\t$annotator\ttarget_site_duplication\t$lTSD\t.\t$strand\t.\tParent=repeat_region$count\n" unless $TSD eq "NA";
		print GFF "$chr\t$annotator\tLTR_retrotransposon\t$start\t$end\t.\t$strand\t.\tID=LTR_retrotransposon$count;Parent=repeat_region$count;tsd=$TSR;ltr_identity=$sim;seq_number=$order{$chr}\n";
		print GFF "$chr\t$annotator\tlong_terminal_repeat\t$start\t$lltr_e\t.\t$strand\t.\tParent=LTR_retrotransposon$count\n";
		print GFF "$chr\t$annotator\tlong_terminal_repeat\t$rltr_s\t$end\t.\t$strand\t.\tParent=LTR_retrotransposon$count\n";
		#print GFF "$chr\t$annotator\ttarget_site_duplication\t$rTSD\t.\t$strand\t.\tParent=repeat_region$count\n" unless $TSD eq "NA";
		print GFF "###\n";

		}
	close Scn;
	$count++;
	}
close List;
#close Scn;
close GFF;
`rm -rf ./$seq_file.finder 2>/dev/null` if $verbose eq 0;
}

print localtime() ." Job finished! Check out $seq_file.finder.combine.scn\n";

